## This file prepares CCES 2016 survey data for analysis and merges original burden measures ##
## CCES data downloaded from: https://doi.org/10.7910/DVN/GDF6Z0 ##
## Created by Meredith Dost and last run 8/19/2025 ##

# load packages
library(haven)
library(tidycensus)
library(tidyr)

# set working directory
#setwd("")

# read in data
# (Stata .dta file, loaded with haven::read_dta)
c16 <- read_dta("other_data/cces2016/CCES16_Common_OUTPUT_Feb2018_VV.dta")

## recoding variables for analyses
# age = survey year (2016) minus reported birth year
c16$age <- 2016 - c16$birthyr

# female: 1 when gender==2, 0 when gender==1, NA for anything else/missing
c16$female <- ifelse(c16$gender == 2, 1, ifelse(c16$gender == 1, 0, NA))

# faminc codes above 16 are treated as missing
# NOTE(review): presumably the codes >16 are non-substantive responses -- confirm against codebook
c16$faminc[which(c16$faminc > 16)] <- NA

# race/ethnicity indicators: each is 1 when its condition is TRUE and 0
# otherwise; a condition that evaluates to NA (missing race/hispanic) also
# yields 0, never NA
c16$whitenh <- with(c16, as.numeric((race == 1 & hispanic != 1) %in% TRUE))
c16$blacknh <- with(c16, as.numeric((race == 2 & hispanic != 1) %in% TRUE))
# NOTE(review): race codes above 7 fall into none of the four indicators -- confirm intended
c16$othernh <- with(c16, as.numeric((race >= 4 & race <= 7 & hispanic != 1) %in% TRUE))
c16$hisp    <- with(c16, as.numeric((hispanic == 1 | race == 3) %in% TRUE))

# numeric state identifier used as the merge key further down
c16$fips_state <- as.numeric(c16$inputstate_post)

# validated voter
# vv starts as 1 when CL_E2016GVM is a non-empty string, 0 when it is the
# empty string, and NA when CL_E2016GVM itself is missing
c16$vv <- ifelse(c16$CL_E2016GVM != "", 1, 0)
# respondents whose CC16_401 answer is observed and is not 5 are set to 0,
# regardless of the value assigned above
# NOTE(review): presumably 5 is the "definitely voted" self-report code -- confirm against codebook
c16$vv[which(c16$CC16_401 != 5)] <- 0

# likely Medicaid
# 1 when healthins_2==1, 0 for any other observed value, NA when missing
c16$med <- as.numeric(c16$healthins_2 == 1)

##### merge in other data ####
## Medicaid expansion status
exp16 <- read.csv("other_data/Medicaid_expansion_status.csv")
# no rows are dropped here; the indicators below are defined relative to 2016
impl_year <- exp16$year_implemented

# exp: 1 if expansion was implemented in 2014-2016, 0 otherwise
# (states with no implementation year recorded are coded 0)
exp_flag <- as.numeric(impl_year >= 2014 & impl_year <= 2016)
exp_flag[is.na(impl_year)] <- 0
exp16$exp <- exp_flag

# exp_alt: "first" for 2014 implementers, "next" for 2015/2016 implementers,
# "0" for everything else (including a missing implementation year)
timing <- rep("0", nrow(exp16))
timing[impl_year %in% 2014] <- "first"
timing[impl_year %in% c(2015, 2016)] <- "next"
exp16$exp_alt <- factor(timing, levels = c("0", "first", "next"))

# keep only the state name and the two indicators; drop temporaries
exp16 <- exp16[c("State", "exp", "exp_alt")]
rm(impl_year, exp_flag, timing)

## Voting burden
vburd16 <- read.csv("burden_data/electburden_measures.csv")
# keep the 2016 rows and columns 1, 3 and 4 of the input file
# NOTE(review): selection is positional; the merge and variable list further
# down expect these columns to be state, turnidx and regidx -- confirm
vburd16 <- vburd16[which(vburd16$year == 2016), c(1, 3:4)]

## Medicaid burden
kff_raw <- read.csv("burden_data/medburden_measures.csv")

# 2016 measure: columns 1 and 3 of the 2016 rows, value column renamed
burden16 <- kff_raw[which(kff_raw$year == 2016), c(1, 3)]
names(burden16)[2] <- "negsum_2016"

# 2012 measure: same columns from the 2012 rows
burden12 <- kff_raw[which(kff_raw$year == 2012), c(1, 3)]
names(burden12)[2] <- "negsum_2012"

# one row per state present in both years, plus the 2016-minus-2012 change
kff <- merge(burden16, burden12, by = "State")
kff$negsum1612 <- with(kff, negsum_2016 - negsum_2012)
rm(kff_raw, burden16, burden12)

## State name-abbreviation crosswalk
st_ab <- read.csv("other_data/state_stateabbr.csv")
# the second column is renamed "state" so it can serve as a merge key later
colnames(st_ab)[2] <- "state"

## Get state-level ACS data on Medicaid receipt
# tell the tidycensus package what your census API key is (and put in quotations)
# census_api_key("", install = TRUE, overwrite=TRUE)
### choose variables
vars.16 <- c("DP05_0001E",# total population 
             "S2704_C02_023", # MEDICAID/MEANS-TESTED PUBLIC COVERAGE ALONE OR IN COMBINATION!!Under 18
             "S2704_C02_024", # MEDICAID/MEANS-TESTED PUBLIC COVERAGE ALONE OR IN COMBINATION!!18 to 64 years
             "S2704_C02_025") # MEDICAID/MEANS-TESTED PUBLIC COVERAGE ALONE OR IN COMBINATION!!65 years and over
### import data
acs16 <- get_acs(geography = "state", 
                 variables = vars.16, 
                 year = 2016, survey = "acs1")
### clean data
# map each output column to its ACS variable ID explicitly, rather than relying
# on the sort order of the variable IDs after reshaping
acs_cols <- c(pop_total = "DP05_0001",
              med_u18   = "S2704_C02_023",
              med_1864  = "S2704_C02_024",
              med_65p   = "S2704_C02_025")
# normalise IDs in case the estimate suffix ("E") is carried through
acs16$variable <- sub("E$", "", acs16$variable)
stopifnot(all(acs_cols %in% acs16$variable))
# one row per state (GEOID), one column per variable; NAME and moe are dropped
data_wide <- pivot_wider(acs16[c("GEOID", "variable", "estimate")],
                         names_from = "variable", values_from = "estimate")
df2 <- as.data.frame(data_wide[c("GEOID", acs_cols)])
names(df2) <- c("GEOID", names(acs_cols))
df2$fips_state <- as.numeric(df2$GEOID)
# share of total population with Medicaid/means-tested coverage, all ages
df2$med_tot <- df2$med_u18 + df2$med_1864 + df2$med_65p
df2$pct_med <- df2$med_tot / df2$pop_total
# highmed = 1 when the state's share is at or above the median across all
# geographies returned by get_acs()
df2$highmed <- ifelse(df2$pct_med >= median(df2$pct_med, na.rm = TRUE), 1, 0)
### keep only variables needed
acs <- df2[c("fips_state", "highmed", "pct_med")]
rm(acs16, df2, data_wide, vars.16, acs_cols)

## get SAHIE data
# read in original data
sahie <- read.csv("demographic_data/input_data/sahie_2015_modified_colnames.csv", row.names=NULL)
# subset to vars needed & rows that are: 18-64 year-olds; all races; all sexes; state-level estimates; at or below 138% of FPL AND all incomes
sahie <- subset(sahie, agecat==1 & racecat==0 & sexcat==0 & geocat==40 & (iprcat==3 | iprcat==0))
sahie <- sahie[c("statefips","NIPR","iprcat")]
# NIPR may be read in as text; convert to numeric (non-numeric entries become NA)
sahie$NIPR <- as.numeric(as.character(sahie$NIPR))
# change format from long to wide: one column per iprcat value ("0" and "3")
sahie <- pivot_wider(sahie, names_from = "iprcat", values_from = "NIPR")
# rename by column name, not position: the order of the "0"/"3" columns follows
# the row order of the input, so positional renaming could silently swap the
# all-incomes and <=138% FPL populations
sahie_rename <- c("statefips" = "fips_state",
                  "0" = "pop_18to64",
                  "3" = "pop_18to64_lt138inc")
stopifnot(all(names(sahie_rename) %in% names(sahie)))
names(sahie)[match(names(sahie_rename), names(sahie))] <- unname(sahie_rename)
rm(sahie_rename)
# compute percent age 18-64 w/<138% FPL
sahie$pcteligib <- sahie$pop_18to64_lt138inc/sahie$pop_18to64
# compute indicator of if pcteligib is >= median national value in this year
sahie$higheligib <- ifelse(sahie$pcteligib >= median(sahie$pcteligib, na.rm = TRUE), 1, 0)
sahie <- sahie[c("fips_state","higheligib")]

#### merging data together ####
# merge() keeps only rows whose key appears in both tables, so a state missing
# from any one source is dropped from the result
# state-level tables keyed on "State"
data <- Reduce(function(lhs, rhs) merge(lhs, rhs, by = "State"),
               list(exp16, kff, st_ab))
# voting burden is keyed on "state" (the renamed crosswalk column)
data <- merge(data, vburd16, by = "state")
# ACS and SAHIE measures are keyed on the numeric state FIPS code
data <- Reduce(function(lhs, rhs) merge(lhs, rhs, by = "fips_state"),
               list(data, acs, sahie))
# attach the state-level covariates to every survey respondent
d <- merge(data, c16, by = "fips_state")

## keep only variables needed for analysis and remove observations with NAs
analysis_vars <- c(
  "vv", "negsum1612", "fips_state", "V101", "commonweight_vv_post", "exp_alt",
  "turnidx", "regidx", "highmed", "pct_med", "higheligib",
  "whitenh", "blacknh", "othernh", "hisp",
  "faminc", "age", "educ", "female", "med"
)
# listwise deletion: a row with NA in any retained column is removed
d2 <- na.omit(d[analysis_vars])
rm(analysis_vars)

## add region variable
# lookup from region label to the state FIPS codes assigned to it
region_fips <- list(
  NewEngland  = c(9, 23, 25, 33, 44, 50),
  MidAtlantic = c(34, 36, 42),
  ENCentral   = c(17, 18, 26, 39, 55),
  WNCentral   = c(19, 20, 27, 29, 31, 38, 46),
  SAtlantic   = c(10, 11, 12, 13, 24, 37, 45, 51, 54),
  ESCentral   = c(1, 21, 28, 47),
  WSCentral   = c(5, 22, 40, 48),
  Mountain    = c(4, 8, 16, 30, 32, 35, 49, 56),
  Pacific     = c(2, 6, 15, 41, 53)
)
# flatten to parallel vectors (one label per FIPS code) and look each
# respondent's state up; codes not listed above get NA
region_label <- rep(names(region_fips), lengths(region_fips))
region_code <- unlist(region_fips, use.names = FALSE)
d2$region <- as.factor(region_label[match(d2$fips_state, region_code)])
rm(region_fips, region_label, region_code)

##### save out data
# write the analysis file without a row-name column; FALSE is spelled out
# because F is an ordinary variable that can be reassigned
write.csv(d2, "other_data/cces_burden_final.csv", row.names = FALSE)
